import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import os
import warnings
warnings.filterwarnings('ignore')
# Load the marketing dataset into a DataFrame and display it.
# NOTE(review): this is a hard-coded absolute Windows path; adjust it for your environment.
data_path = 'D:\\data science\\Batch67 Day28\\Projects\\Project -5 Marketting Department\\Marketing_data.csv'
creditcard_df = pd.read_csv(data_path)
creditcard_df
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | C10001 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | C10002 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | C10003 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | C10004 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | NaN | 0.000000 | 12 |
| 4 | C10005 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | C19186 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8946 | C19187 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | NaN | 0.000000 | 6 |
| 8947 | C19188 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | C19189 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | C19190 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |
8950 rows × 18 columns
creditcard_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8950 entries, 0 to 8949 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CUST_ID 8950 non-null object 1 BALANCE 8950 non-null float64 2 BALANCE_FREQUENCY 8950 non-null float64 3 PURCHASES 8950 non-null float64 4 ONEOFF_PURCHASES 8950 non-null float64 5 INSTALLMENTS_PURCHASES 8950 non-null float64 6 CASH_ADVANCE 8950 non-null float64 7 PURCHASES_FREQUENCY 8950 non-null float64 8 ONEOFF_PURCHASES_FREQUENCY 8950 non-null float64 9 PURCHASES_INSTALLMENTS_FREQUENCY 8950 non-null float64 10 CASH_ADVANCE_FREQUENCY 8950 non-null float64 11 CASH_ADVANCE_TRX 8950 non-null int64 12 PURCHASES_TRX 8950 non-null int64 13 CREDIT_LIMIT 8949 non-null float64 14 PAYMENTS 8950 non-null float64 15 MINIMUM_PAYMENTS 8637 non-null float64 16 PRC_FULL_PAYMENT 8950 non-null float64 17 TENURE 8950 non-null int64 dtypes: float64(14), int64(3), object(1) memory usage: 1.2+ MB
creditcard_df.describe()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8950.000000 | 8949.000000 | 8950.000000 | 8637.000000 | 8950.000000 | 8950.000000 |
| mean | 1564.474828 | 0.877271 | 1003.204834 | 592.437371 | 411.067645 | 978.871112 | 0.490351 | 0.202458 | 0.364437 | 0.135144 | 3.248827 | 14.709832 | 4494.449450 | 1733.143852 | 864.206542 | 0.153715 | 11.517318 |
| std | 2081.531879 | 0.236904 | 2136.634782 | 1659.887917 | 904.338115 | 2097.163877 | 0.401371 | 0.298336 | 0.397448 | 0.200121 | 6.824647 | 24.857649 | 3638.815725 | 2895.063757 | 2372.446607 | 0.292499 | 1.338331 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 50.000000 | 0.000000 | 0.019163 | 0.000000 | 6.000000 |
| 25% | 128.281915 | 0.888889 | 39.635000 | 0.000000 | 0.000000 | 0.000000 | 0.083333 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1600.000000 | 383.276166 | 169.123707 | 0.000000 | 12.000000 |
| 50% | 873.385231 | 1.000000 | 361.280000 | 38.000000 | 89.000000 | 0.000000 | 0.500000 | 0.083333 | 0.166667 | 0.000000 | 0.000000 | 7.000000 | 3000.000000 | 856.901546 | 312.343947 | 0.000000 | 12.000000 |
| 75% | 2054.140036 | 1.000000 | 1110.130000 | 577.405000 | 468.637500 | 1113.821139 | 0.916667 | 0.300000 | 0.750000 | 0.222222 | 4.000000 | 17.000000 | 6500.000000 | 1901.134317 | 825.485459 | 0.142857 | 12.000000 |
| max | 19043.138560 | 1.000000 | 49039.570000 | 40761.250000 | 22500.000000 | 47137.211760 | 1.000000 | 1.000000 | 1.000000 | 1.500000 | 123.000000 | 358.000000 | 30000.000000 | 50721.483360 | 76406.207520 | 1.000000 | 12.000000 |
# Inspect the customer(s) with the largest ONEOFF_PURCHASES value.
# Comparing against the column maximum avoids hard-coding a float literal
# (40761.25 in this dataset) that has to match the stored value exactly.
max_oneoff_purchase = creditcard_df['ONEOFF_PURCHASES'].max()
creditcard_df[creditcard_df['ONEOFF_PURCHASES'] == max_oneoff_purchase]
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 550 | C10574 | 11547.52001 | 1.0 | 49039.57 | 40761.25 | 8278.32 | 558.166886 | 1.0 | 1.0 | 0.916667 | 0.083333 | 1 | 101 | 22500.0 | 46930.59824 | 2974.069421 | 0.25 | 12 |
# Inspect the customer(s) with the largest cash advance (about $47,137 in this dataset).
# In the recorded output this customer made 123 cash advance transactions
# and has PRC_FULL_PAYMENT == 0, i.e. never paid the card in full.
# The column maximum is used instead of a hand-typed float literal, which only
# matches when it equals the stored value exactly.
max_cash_advance = creditcard_df['CASH_ADVANCE'].max()
creditcard_df[creditcard_df['CASH_ADVANCE'] == max_cash_advance]
| CUST_ID | BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2159 | C12226 | 10905.05381 | 1.0 | 431.93 | 133.5 | 298.43 | 47137.21176 | 0.583333 | 0.25 | 0.5 | 1.0 | 123 | 21 | 19600.0 | 39048.59762 | 5394.173671 | 0.0 | 12 |
creditcard_df.isna().sum()
CUST_ID 0 BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 1 PAYMENTS 0 MINIMUM_PAYMENTS 313 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
import seaborn as sns
# Visualise missing values: the boolean null-mask is drawn as a heatmap, one column per feature
missing_mask = creditcard_df.isnull()
sns.heatmap(missing_mask, cmap="Blues", cbar=False, yticklabels=False)
<AxesSubplot:>
# Fill the missing 'MINIMUM_PAYMENTS' entries with the column mean.
# Series.mean() skips NaN by default, so the mean is computed over the observed values only.
minimum_payments_mean = creditcard_df['MINIMUM_PAYMENTS'].mean()
creditcard_df['MINIMUM_PAYMENTS'] = creditcard_df['MINIMUM_PAYMENTS'].fillna(minimum_payments_mean)
# Re-check the per-column missing-value counts after imputation
creditcard_df.isna().sum()
CUST_ID 0 BALANCE 0 BALANCE_FREQUENCY 0 PURCHASES 0 ONEOFF_PURCHASES 0 INSTALLMENTS_PURCHASES 0 CASH_ADVANCE 0 PURCHASES_FREQUENCY 0 ONEOFF_PURCHASES_FREQUENCY 0 PURCHASES_INSTALLMENTS_FREQUENCY 0 CASH_ADVANCE_FREQUENCY 0 CASH_ADVANCE_TRX 0 PURCHASES_TRX 0 CREDIT_LIMIT 1 PAYMENTS 0 MINIMUM_PAYMENTS 0 PRC_FULL_PAYMENT 0 TENURE 0 dtype: int64
# Fill the missing 'CREDIT_LIMIT' entries with the column mean.
# Series.mean() skips NaN by default, so the mean is computed over the observed values only.
credit_limit_mean = creditcard_df['CREDIT_LIMIT'].mean()
creditcard_df['CREDIT_LIMIT'] = creditcard_df['CREDIT_LIMIT'].fillna(credit_limit_mean)
()
()
creditcard_df.duplicated().sum()
0
sns.heatmap(creditcard_df.isnull(),yticklabels=False,cbar=False,cmap='Blues')
<AxesSubplot:>
creditcard_df.shape
(8950, 18)
# Remove the CUST_ID column (presumably just a customer identifier) from creditcard_df in place,
# leaving only the numeric feature columns, then show the result.
creditcard_df.drop(columns=['CUST_ID'], inplace=True)
display(creditcard_df)
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8946 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | 864.206542 | 0.000000 | 6 |
| 8947 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |
8950 rows × 17 columns
len(creditcard_df.columns)
17
creditcard_df.columns
Index(['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES', 'ONEOFF_PURCHASES',
'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE', 'PURCHASES_FREQUENCY',
'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
'CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'PURCHASES_TRX',
'CREDIT_LIMIT', 'PAYMENTS', 'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT',
'TENURE'],
dtype='object')
# sns.distplot draws a histogram (matplotlib hist) with a seaborn KDE curve on top.
# KDE = Kernel Density Estimate: it visualises the estimated probability density
# of a continuous variable across its range of values.
#
# Observations (exact figures are in the describe() summary):
# - Mean balance is about $1,560.
# - 'BALANCE_FREQUENCY' is close to 1 for most customers (balance updated frequently).
# - 'PURCHASES_FREQUENCY' shows two distinct groups of customers.
# - 'ONEOFF_PURCHASES_FREQUENCY' and 'PURCHASES_INSTALLMENTS_FREQUENCY': most customers
#   rarely make one-off or installment purchases.
# - 'PRC_FULL_PAYMENT' is ~0 for most customers: only a small number pay their balance in full.
# - Average credit limit is around $4,500.
# - 'TENURE' is 12 for most customers (mean ~11.5).
#   NOTE(review): the unit is not stated in the data (the original note said "years") - confirm.
feature_names = creditcard_df.columns
plt.figure(figsize=(20, 80))
for position, feature in enumerate(feature_names, start=1):
    # One subplot row per feature; the row count follows the number of columns
    # instead of a hard-coded 17.
    plt.subplot(len(feature_names), 1, position)
    sns.distplot(creditcard_df[feature], kde_kws={"color": "b", "lw": 3, "label": "KDE"}, hist_kws={"color": "g"})
    plt.title(feature)
plt.tight_layout()
sns.pairplot(creditcard_df)
# 'PURCHASES' is correlated with 'ONEOFF_PURCHASES' and 'INSTALLMENTS_PURCHASES'
# There is a visible trend between 'PURCHASES' and both 'CREDIT_LIMIT' and 'PAYMENTS'
<seaborn.axisgrid.PairGrid at 0x2019424c610>
# Pairwise correlation matrix of all feature columns (DataFrame.corr with its default settings)
correlations = creditcard_df.corr()
display (correlations )
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| BALANCE | 1.000000 | 0.322412 | 0.181261 | 0.164350 | 0.126469 | 0.496692 | -0.077944 | 0.073166 | -0.063186 | 0.449218 | 0.385152 | 0.154338 | 0.531267 | 0.322802 | 0.394282 | -0.318959 | 0.072692 |
| BALANCE_FREQUENCY | 0.322412 | 1.000000 | 0.133674 | 0.104323 | 0.124292 | 0.099388 | 0.229715 | 0.202415 | 0.176079 | 0.191873 | 0.141555 | 0.189626 | 0.095795 | 0.065008 | 0.114249 | -0.095082 | 0.119776 |
| PURCHASES | 0.181261 | 0.133674 | 1.000000 | 0.916845 | 0.679896 | -0.051474 | 0.393017 | 0.498430 | 0.315567 | -0.120143 | -0.067175 | 0.689561 | 0.356959 | 0.603264 | 0.093515 | 0.180379 | 0.086288 |
| ONEOFF_PURCHASES | 0.164350 | 0.104323 | 0.916845 | 1.000000 | 0.330622 | -0.031326 | 0.264937 | 0.524891 | 0.127729 | -0.082628 | -0.046212 | 0.545523 | 0.319721 | 0.567292 | 0.048597 | 0.132763 | 0.064150 |
| INSTALLMENTS_PURCHASES | 0.126469 | 0.124292 | 0.679896 | 0.330622 | 1.000000 | -0.064244 | 0.442418 | 0.214042 | 0.511351 | -0.132318 | -0.073999 | 0.628108 | 0.256496 | 0.384084 | 0.131687 | 0.182569 | 0.086143 |
| CASH_ADVANCE | 0.496692 | 0.099388 | -0.051474 | -0.031326 | -0.064244 | 1.000000 | -0.215507 | -0.086754 | -0.177070 | 0.628522 | 0.656498 | -0.075850 | 0.303983 | 0.453238 | 0.139223 | -0.152935 | -0.068312 |
| PURCHASES_FREQUENCY | -0.077944 | 0.229715 | 0.393017 | 0.264937 | 0.442418 | -0.215507 | 1.000000 | 0.501343 | 0.862934 | -0.308478 | -0.203478 | 0.568430 | 0.119778 | 0.103464 | 0.002976 | 0.305802 | 0.061506 |
| ONEOFF_PURCHASES_FREQUENCY | 0.073166 | 0.202415 | 0.498430 | 0.524891 | 0.214042 | -0.086754 | 0.501343 | 1.000000 | 0.142329 | -0.111716 | -0.069088 | 0.544869 | 0.295030 | 0.243537 | -0.029963 | 0.157531 | 0.082466 |
| PURCHASES_INSTALLMENTS_FREQUENCY | -0.063186 | 0.176079 | 0.315567 | 0.127729 | 0.511351 | -0.177070 | 0.862934 | 0.142329 | 1.000000 | -0.262958 | -0.169207 | 0.529975 | 0.060752 | 0.085551 | 0.029590 | 0.250087 | 0.073275 |
| CASH_ADVANCE_FREQUENCY | 0.449218 | 0.191873 | -0.120143 | -0.082628 | -0.132318 | 0.628522 | -0.308478 | -0.111716 | -0.262958 | 1.000000 | 0.799561 | -0.131168 | 0.132616 | 0.183192 | 0.097898 | -0.249773 | -0.133372 |
| CASH_ADVANCE_TRX | 0.385152 | 0.141555 | -0.067175 | -0.046212 | -0.073999 | 0.656498 | -0.203478 | -0.069088 | -0.169207 | 0.799561 | 1.000000 | -0.066157 | 0.149699 | 0.255278 | 0.109185 | -0.169784 | -0.043421 |
| PURCHASES_TRX | 0.154338 | 0.189626 | 0.689561 | 0.545523 | 0.628108 | -0.075850 | 0.568430 | 0.544869 | 0.529975 | -0.131168 | -0.066157 | 1.000000 | 0.272877 | 0.370832 | 0.095858 | 0.162066 | 0.121874 |
| CREDIT_LIMIT | 0.531267 | 0.095795 | 0.356959 | 0.319721 | 0.256496 | 0.303983 | 0.119778 | 0.295030 | 0.060752 | 0.132616 | 0.149699 | 0.272877 | 1.000000 | 0.421852 | 0.125134 | 0.055671 | 0.139034 |
| PAYMENTS | 0.322802 | 0.065008 | 0.603264 | 0.567292 | 0.384084 | 0.453238 | 0.103464 | 0.243537 | 0.085551 | 0.183192 | 0.255278 | 0.370832 | 0.421852 | 1.000000 | 0.125046 | 0.112138 | 0.106136 |
| MINIMUM_PAYMENTS | 0.394282 | 0.114249 | 0.093515 | 0.048597 | 0.131687 | 0.139223 | 0.002976 | -0.029963 | 0.029590 | 0.097898 | 0.109185 | 0.095858 | 0.125134 | 0.125046 | 1.000000 | -0.139674 | 0.057257 |
| PRC_FULL_PAYMENT | -0.318959 | -0.095082 | 0.180379 | 0.132763 | 0.182569 | -0.152935 | 0.305802 | 0.157531 | 0.250087 | -0.249773 | -0.169784 | 0.162066 | 0.055671 | 0.112138 | -0.139674 | 1.000000 | -0.016486 |
| TENURE | 0.072692 | 0.119776 | 0.086288 | 0.064150 | 0.086143 | -0.068312 | 0.061506 | 0.082466 | 0.073275 | -0.133372 | -0.043421 | 0.121874 | 0.139034 | 0.106136 | 0.057257 | -0.016486 | 1.000000 |
# Annotated heatmap of the correlation matrix on a 20x20 inch figure.
# 'PURCHASES' correlates strongly with one-off purchases (0.92), installment purchases (0.68),
# purchase transactions (0.69) and payments (0.60), and more weakly with credit limit (0.36).
# Strong positive correlation (0.86) between 'PURCHASES_FREQUENCY' and 'PURCHASES_INSTALLMENTS_FREQUENCY'.
f, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(correlations, annot=True, ax=ax)
<AxesSubplot:>
display (creditcard_df)
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.40 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.00 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.00 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.00 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.00 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | 28.493517 | 1.000000 | 291.12 | 0.00 | 291.12 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 325.594462 | 48.886365 | 0.500000 | 6 |
| 8946 | 19.183215 | 1.000000 | 300.00 | 0.00 | 300.00 | 0.000000 | 1.000000 | 0.000000 | 0.833333 | 0.000000 | 0 | 6 | 1000.0 | 275.861322 | 864.206542 | 0.000000 | 6 |
| 8947 | 23.398673 | 0.833333 | 144.40 | 0.00 | 144.40 | 0.000000 | 0.833333 | 0.000000 | 0.666667 | 0.000000 | 0 | 5 | 1000.0 | 81.270775 | 82.418369 | 0.250000 | 6 |
| 8948 | 13.457564 | 0.833333 | 0.00 | 0.00 | 0.00 | 36.558778 | 0.000000 | 0.000000 | 0.000000 | 0.166667 | 2 | 0 | 500.0 | 52.549959 | 55.755628 | 0.250000 | 6 |
| 8949 | 372.708075 | 0.666667 | 1093.25 | 1093.25 | 0.00 | 127.040008 | 0.666667 | 0.666667 | 0.000000 | 0.333333 | 2 | 23 | 1200.0 | 63.165404 | 88.288956 | 0.000000 | 6 |
8950 rows × 17 columns
# Standardise the features before clustering: learn per-column mean/std, then apply them
scaler = StandardScaler()
scaler.fit(creditcard_df)
creditcard_df_scaled = scaler.transform(creditcard_df)
creditcard_df_scaled.shape
(8950, 17)
display(pd.DataFrame(creditcard_df_scaled))
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.731989 | -0.249434 | -0.424900 | -0.356934 | -0.349079 | -0.466786 | -0.806490 | -0.678661 | -0.707313 | -0.675349 | -0.476070 | -0.511333 | -0.960433 | -0.528979 | -3.109675e-01 | -0.525551 | 0.360680 |
| 1 | 0.786961 | 0.134325 | -0.469552 | -0.356934 | -0.454576 | 2.605605 | -1.221758 | -0.678661 | -0.916995 | 0.573963 | 0.110074 | -0.591796 | 0.688639 | 0.818642 | 8.931021e-02 | 0.234227 | 0.360680 |
| 2 | 0.447135 | 0.518084 | -0.107668 | 0.108889 | -0.454576 | -0.466786 | 1.269843 | 2.673451 | -0.916995 | -0.675349 | -0.476070 | -0.109020 | 0.826062 | -0.383805 | -1.016632e-01 | -0.525551 | 0.360680 |
| 3 | 0.049099 | -1.016953 | 0.232058 | 0.546189 | -0.454576 | -0.368653 | -1.014125 | -0.399319 | -0.916995 | -0.258913 | -0.329534 | -0.551565 | 0.826062 | -0.598688 | -4.390474e-16 | -0.525551 | 0.360680 |
| 4 | -0.358775 | 0.518084 | -0.462063 | -0.347294 | -0.454576 | -0.466786 | -1.014125 | -0.399319 | -0.916995 | -0.675349 | -0.476070 | -0.551565 | -0.905464 | -0.364368 | -2.657913e-01 | -0.525551 | 0.360680 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | -0.737950 | 0.518084 | -0.333293 | -0.356934 | -0.132643 | -0.466786 | 1.269843 | -0.678661 | 1.179833 | -0.675349 | -0.476070 | -0.350408 | -0.960433 | -0.486217 | -3.498541e-01 | 1.183951 | -4.122768 |
| 8946 | -0.742423 | 0.518084 | -0.329136 | -0.356934 | -0.122823 | -0.466786 | 1.269843 | -0.678661 | 1.179833 | -0.675349 | -0.476070 | -0.350408 | -0.960433 | -0.503396 | -4.390474e-16 | -0.525551 | -4.122768 |
| 8947 | -0.740398 | -0.185477 | -0.401965 | -0.356934 | -0.294893 | -0.466786 | 0.854576 | -0.678661 | 0.760469 | -0.675349 | -0.476070 | -0.390639 | -0.960433 | -0.570615 | -3.354655e-01 | 0.329200 | -4.122768 |
| 8948 | -0.745174 | -0.185477 | -0.469552 | -0.356934 | -0.454576 | -0.449352 | -1.221758 | -0.678661 | -0.916995 | 0.157527 | -0.182998 | -0.591796 | -1.097856 | -0.580536 | -3.469065e-01 | 0.329200 | -4.122768 |
| 8949 | -0.572575 | -0.889033 | 0.042146 | 0.301732 | -0.454576 | -0.406205 | 0.439310 | 1.556082 | -0.916995 | 0.990398 | -0.182998 | 0.333524 | -0.905464 | -0.576869 | -3.329464e-01 | -0.525551 | -4.122768 |
8950 rows × 17 columns
# Elbow method: fit K-Means for k = 1..19 and record each model's inertia_
# (within-cluster sum of squared distances).
scores_1 = []
range_values = range(1, 20)
for n_clusters in range_values:
    # Fixed seed so the curve is reproducible between runs
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(creditcard_df_scaled)
    scores_1.append(kmeans.inertia_)
# Plot against the actual cluster counts. Plotting scores_1 alone uses the list
# index (0..18) as x, which shifts every point - and the apparent elbow - by one.
plt.plot(range_values, scores_1, 'bx-')
plt.title('Finding the right number of clusters')
plt.xlabel('Clusters')
plt.ylabel('Scores')
plt.show()
# From the elbow plot, the curve starts to bend at around 4 clusters.
# However, the scores do not settle into a roughly linear decrease until about 8 clusters.
# Let's choose the number of clusters to be 8.
# random_state is fixed so that cluster ids (and every per-cluster interpretation that
# follows) stay the same between runs; without it K-Means numbers the clusters arbitrarily.
kmeans = KMeans(n_clusters=8, random_state=42)
kmeans.fit(creditcard_df_scaled)
labels = kmeans.labels_  # cluster id assigned to each row of creditcard_df_scaled
kmeans.cluster_centers_.shape
(8, 17)
# Cluster centres in the standardised feature space, one row per cluster.
# The column Index is passed directly: wrapping it in a list ([creditcard_df.columns])
# makes pandas build a one-level MultiIndex instead of plain column labels.
cluster_centers = pd.DataFrame(data=kmeans.cluster_centers_, columns=creditcard_df.columns)
display(cluster_centers )
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.366639 | 0.333155 | -0.038243 | -0.244205 | 0.358036 | -0.363598 | 0.991214 | -0.386909 | 1.206355 | -0.475154 | -0.361158 | 0.187123 | -0.260825 | -0.217616 | -0.045775 | 0.315794 | 0.257381 |
| 1 | 1.690200 | 0.396003 | -0.219127 | -0.157223 | -0.229345 | 2.020153 | -0.480727 | -0.214389 | -0.417038 | 1.925066 | 1.939912 | -0.268180 | 1.027024 | 0.808261 | 0.528696 | -0.395008 | 0.069894 |
| 2 | -0.162873 | 0.392754 | 0.473201 | 0.617653 | -0.016010 | -0.333808 | 0.944809 | 1.880640 | 0.090573 | -0.408068 | -0.324367 | 0.532389 | 0.383794 | 0.100567 | -0.162365 | 0.414080 | 0.262038 |
| 3 | -0.701163 | -2.144787 | -0.311233 | -0.235823 | -0.302542 | -0.319756 | -0.557150 | -0.445187 | -0.440134 | -0.520622 | -0.376063 | -0.419936 | -0.175963 | -0.192408 | -0.256478 | 0.283762 | 0.199336 |
| 4 | 0.934901 | 0.467049 | 2.252953 | 1.707412 | 2.190297 | -0.192704 | 1.160751 | 1.548096 | 1.257221 | -0.310075 | -0.208712 | 2.800594 | 1.235324 | 1.282259 | 0.575854 | 0.273318 | 0.333736 |
| 5 | 1.923051 | 0.337717 | 11.212042 | 10.600367 | 7.033118 | 0.419625 | 1.046983 | 1.915501 | 0.981334 | -0.258912 | 0.061229 | 5.362438 | 3.044064 | 8.098975 | 1.120318 | 1.110132 | 0.310863 |
| 6 | -0.336050 | -0.347078 | -0.289267 | -0.215966 | -0.286835 | 0.068284 | -0.203078 | -0.288661 | -0.224549 | 0.308663 | 0.000996 | -0.388117 | -0.567159 | -0.392680 | -0.209145 | 0.014011 | -3.203733 |
| 7 | 0.016269 | 0.403662 | -0.361585 | -0.246994 | -0.401079 | -0.090629 | -0.865326 | -0.409276 | -0.757165 | 0.110146 | -0.024439 | -0.486185 | -0.306840 | -0.249619 | -0.009712 | -0.456880 | 0.272704 |
# The centres are in standardised units; apply the scaler's inverse transformation
# so they can be read on the original feature scale.
# The transformation is applied to kmeans.cluster_centers_ (not to the `cluster_centers`
# variable) so that re-running this cell never inverse-transforms the values twice.
cluster_centers = scaler.inverse_transform(kmeans.cluster_centers_)
# Pass the column Index directly; wrapping it in a list would create a MultiIndex.
cluster_centers = pd.DataFrame(data=cluster_centers, columns=creditcard_df.columns)
display(cluster_centers)
# Cluster indices are arbitrary; the indices in parentheses refer to the centres in the recorded output.
# Transactors (index 3): customers who pay the least interest and are careful with their money; lowest balance (~$105), low cash advance (~$308), percentage of full payment ~24%
# Revolvers (index 1): use the credit card as a loan (most lucrative segment); high balance (~$5,100), highest cash advance (~$5,200), low purchase frequency, high cash advance frequency (~0.5), many cash advance transactions (~16) and low percentage of full payment (~4%)
# VIP/Prime (index 5): highest credit limit (~$15.6K) and highest percentage of full payment (~48%); target for a credit limit increase to encourage more spending
# Low tenure (index 6): lowest tenure (~7; NOTE(review): unit not stated in the data - confirm) and a relatively low balance
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 801.347031 | 0.956192 | 921.498712 | 187.107663 | 734.835246 | 216.389570 | 0.888173 | 0.087035 | 0.843874 | 0.040061 | 0.784187 | 19.361014 | 3545.460071 | 1103.167378 | 757.529897 | 0.246079 | 11.861760 |
| 1 | 5082.483731 | 0.971080 | 535.035681 | 331.479330 | 203.673753 | 5215.225625 | 0.297412 | 0.138501 | 0.198696 | 0.520370 | 16.487298 | 8.043880 | 8231.183078 | 4072.979303 | 2096.311415 | 0.038182 | 11.610855 |
| 2 | 1225.469199 | 0.970310 | 2014.205154 | 1617.614991 | 396.590163 | 278.861158 | 0.869548 | 0.763489 | 0.400433 | 0.053486 | 1.035262 | 27.943038 | 5890.847444 | 2024.274180 | 485.822410 | 0.274826 | 11.867993 |
| 3 | 105.063837 | 0.369191 | 338.250830 | 201.019898 | 137.482244 | 308.328143 | 0.266740 | 0.069650 | 0.189517 | 0.030962 | 0.682472 | 4.271804 | 3854.224762 | 1176.141747 | 266.495312 | 0.236710 | 11.784081 |
| 4 | 3510.393168 | 0.987910 | 5816.674519 | 3426.390962 | 2391.725865 | 574.762188 | 0.956216 | 0.664285 | 0.864089 | 0.073095 | 1.824519 | 84.322115 | 8989.062500 | 5445.158665 | 2206.210364 | 0.233656 | 11.963942 |
| 5 | 5567.142164 | 0.957273 | 24957.905000 | 18186.875667 | 6771.029333 | 1858.844605 | 0.910556 | 0.773889 | 0.754444 | 0.083333 | 3.666667 | 148.000000 | 15570.000000 | 25178.882690 | 3475.059479 | 0.478409 | 11.933333 |
| 6 | 865.015978 | 0.795051 | 385.181720 | 233.977974 | 151.686061 | 1122.064941 | 0.408846 | 0.116344 | 0.275196 | 0.196911 | 3.255627 | 5.062701 | 2430.891398 | 596.373827 | 376.802926 | 0.157813 | 7.229904 |
| 7 | 1598.338053 | 0.972895 | 230.672016 | 182.477546 | 48.376545 | 788.817093 | 0.143054 | 0.080363 | 0.063520 | 0.157186 | 3.082046 | 2.625092 | 3378.041452 | 1010.522498 | 841.572105 | 0.020085 | 11.882266 |
display(labels.shape) # Labels associated to each data point
display (labels.max())
display (labels.min())
(8950,)
7
0
# Assign every customer to the nearest centre of the model that is already fitted.
# fit_predict() would run K-Means again from a fresh initialisation, so the returned ids
# need not agree with `labels` and `cluster_centers` computed earlier (the recorded
# output shows such a mismatch between this array and the `cluster` column).
y_kmeans = kmeans.predict(creditcard_df_scaled)
display(y_kmeans)
array([0, 2, 6, ..., 7, 7, 7])
# Add the cluster labels to a copy of the original dataframe as a 'cluster' column.
# assign() attaches the array positionally; pd.concat(axis=1) aligns on index labels and
# would produce NaN-padded rows if creditcard_df's index were not exactly 0..n-1.
creditcard_df_cluster = creditcard_df.assign(cluster=labels)
creditcard_df_cluster.head()
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 | 7 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 | 1 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 | 2 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 | 7 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 | 7 |
# For every feature, draw one histogram per cluster side by side (one figure per feature).
# The number of panels follows the fitted model instead of a hard-coded 8.
cluster_count = kmeans.n_clusters
for feature in creditcard_df.columns:
    plt.figure(figsize=(20, 5))
    for cluster_id in range(cluster_count):
        plt.subplot(1, cluster_count, cluster_id + 1)
        # Rows of the labelled dataframe that belong to this cluster
        cluster = creditcard_df_cluster[creditcard_df_cluster['cluster'] == cluster_id]
        cluster[feature].hist(bins=20)
        plt.title('{} \nCluster {} '.format(feature, cluster_id))
    plt.show()
# Reduce the standardised features to two principal components (used for the 2-D scatter plot below)
pca = PCA(n_components=2)
principal_comp = pca.fit_transform(creditcard_df_scaled)
display(principal_comp)
array([[-1.68222241, -1.0764406 ],
[-1.13830449, 2.50651402],
[ 0.96968867, -0.38355405],
...,
[-0.92620335, -1.81078101],
[-2.33655014, -0.65796128],
[-0.55641626, -0.40049209]])
# Wrap the two principal components in a DataFrame with named columns
pca_columns = ['pca1', 'pca2']
pca_df = pd.DataFrame(principal_comp, columns=pca_columns)
pca_df.head()
| pca1 | pca2 | |
|---|---|---|
| 0 | -1.682222 | -1.076441 |
| 1 | -1.138304 | 2.506514 |
| 2 | 0.969689 | -0.383554 |
| 3 | -0.873631 | 0.043177 |
| 4 | -1.599435 | -0.688573 |
# Append each customer's K-Means cluster id as a 'cluster' column next to the PCA coordinates
cluster_column = pd.DataFrame({'cluster': labels})
pca_df = pd.concat([pca_df, cluster_column], axis=1)
display(pca_df)
| pca1 | pca2 | cluster | |
|---|---|---|---|
| 0 | -1.682222 | -1.076441 | 7 |
| 1 | -1.138304 | 2.506514 | 1 |
| 2 | 0.969689 | -0.383554 | 2 |
| 3 | -0.873631 | 0.043177 | 7 |
| 4 | -1.599435 | -0.688573 | 7 |
| ... | ... | ... | ... |
| 8945 | -0.359629 | -2.016141 | 6 |
| 8946 | -0.564367 | -1.639127 | 6 |
| 8947 | -0.926203 | -1.810781 | 6 |
| 8948 | -2.336550 | -0.657961 | 6 |
| 8949 | -0.556416 | -0.400492 | 6 |
8950 rows × 3 columns
# Number of customers in each cluster, largest cluster first.
# Series.value_counts() on the column is the direct form; DataFrame.value_counts()
# expects column labels in `subset`, not a Series of values.
pca_df['cluster'].value_counts()
cluster 7 2718 0 2011 3 1181 2 1106 1 866 6 622 4 416 5 30 dtype: int64
# Scatter plot of the two principal components, coloured by cluster (one palette entry per cluster id 0-7)
cluster_palette = ['red','green','blue','pink','yellow','gray','purple', 'black']
plt.figure(figsize=(20, 10))
ax = sns.scatterplot(data=pca_df, x="pca1", y="pca2", hue="cluster", palette=cluster_palette)
plt.show()
from tensorflow.keras.layers import Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.initializers import glorot_uniform # Glorot (Xavier) uniform weight initializer
# NOTE(review): this import uses the standalone `keras` package while the lines above use `tensorflow.keras`; SGD is not used in the visible code.
from keras.optimizers import SGD
# Dense autoencoder over the scaled customer features.
# `autoencoder` maps the input back to itself; `encoder` shares the same
# layers but stops at the 10-unit `encoded` layer and is used for
# dimensionality reduction.
encoding_dim = 7

# Take the feature count from the training array instead of hard-coding 17,
# so the input and reconstruction layers always match the data passed to fit().
n_features = creditcard_df_scaled.shape[1]
input_df = Input(shape=(n_features,))

# Encoder.
# NOTE(review): this first layer is only `encoding_dim` (7) units wide, i.e.
# narrower than the 10-unit code layer below -- confirm this is intended.
x = Dense(encoding_dim, activation='relu')(input_df)
# The remaining layers explicitly request the Glorot (Xavier) *uniform*
# initializer, which draws weights from a uniform distribution.
x = Dense(500, activation='relu', kernel_initializer='glorot_uniform')(x)
x = Dense(500, activation='relu', kernel_initializer='glorot_uniform')(x)
x = Dense(2000, activation='relu', kernel_initializer='glorot_uniform')(x)

# Bottleneck: the 10-dimensional code that `encoder` outputs.
encoded = Dense(10, activation='relu', kernel_initializer='glorot_uniform')(x)

# Decoder.
x = Dense(2000, activation='relu', kernel_initializer='glorot_uniform')(encoded)
x = Dense(500, activation='relu', kernel_initializer='glorot_uniform')(x)
# No activation on the reconstruction layer, so its output is linear.
decoded = Dense(n_features, kernel_initializer='glorot_uniform')(x)

# Full autoencoder: input -> reconstruction.
autoencoder = Model(input_df, decoded)

# Encoder only: input -> 10-dimensional code, used for dimension reduction.
encoder = Model(input_df, encoded)

# Train by minimising the mean squared reconstruction error.
autoencoder.compile(optimizer='adam', loss='mean_squared_error')
# Show the (rows, columns) shape of the array the autoencoder is trained on.
display (creditcard_df_scaled.shape)
(8950, 17)
# Train the autoencoder to reconstruct its own input: the same array is
# passed as both the features and the target.
autoencoder.fit(
    x=creditcard_df_scaled,
    y=creditcard_df_scaled,
    batch_size=128,
    epochs=25,
    verbose=1,
)
Epoch 1/25 70/70 [==============================] - 6s 56ms/step - loss: 0.5177 Epoch 2/25 70/70 [==============================] - 4s 55ms/step - loss: 0.2855 Epoch 3/25 70/70 [==============================] - 4s 56ms/step - loss: 0.2301 Epoch 4/25 70/70 [==============================] - 4s 56ms/step - loss: 0.2059 Epoch 5/25 70/70 [==============================] - 4s 55ms/step - loss: 0.1784 Epoch 6/25 70/70 [==============================] - 4s 55ms/step - loss: 0.1590 Epoch 7/25 70/70 [==============================] - 4s 57ms/step - loss: 0.1439 Epoch 8/25 70/70 [==============================] - 4s 55ms/step - loss: 0.1253 Epoch 9/25 70/70 [==============================] - 4s 55ms/step - loss: 0.1150 Epoch 10/25 70/70 [==============================] - 4s 55ms/step - loss: 0.1067 Epoch 11/25 70/70 [==============================] - 4s 55ms/step - loss: 0.1011 Epoch 12/25 70/70 [==============================] - 4s 55ms/step - loss: 0.0952 Epoch 13/25 70/70 [==============================] - 4s 57ms/step - loss: 0.0918 Epoch 14/25 70/70 [==============================] - 4s 60ms/step - loss: 0.0892 Epoch 15/25 70/70 [==============================] - 4s 58ms/step - loss: 0.0837 Epoch 16/25 70/70 [==============================] - 4s 56ms/step - loss: 0.0784 Epoch 17/25 70/70 [==============================] - 4s 56ms/step - loss: 0.0726 Epoch 18/25 70/70 [==============================] - 4s 55ms/step - loss: 0.0701 Epoch 19/25 70/70 [==============================] - 4s 63ms/step - loss: 0.0723 Epoch 20/25 70/70 [==============================] - 4s 60ms/step - loss: 0.0617 Epoch 21/25 70/70 [==============================] - 4s 58ms/step - loss: 0.0586 Epoch 22/25 70/70 [==============================] - 4s 57ms/step - loss: 0.0572 Epoch 23/25 70/70 [==============================] - 4s 57ms/step - loss: 0.0554 Epoch 24/25 70/70 [==============================] - 4s 55ms/step - loss: 0.0566 Epoch 25/25 70/70 [==============================] - 4s 
55ms/step - loss: 0.0517
<keras.callbacks.History at 0x201b02968e0>
# Print the layer-by-layer architecture and parameter counts of the autoencoder.
autoencoder.summary()
Model: "model"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_1 (InputLayer) [(None, 17)] 0
dense (Dense) (None, 7) 126
dense_1 (Dense) (None, 500) 4000
dense_2 (Dense) (None, 500) 250500
dense_3 (Dense) (None, 2000) 1002000
dense_4 (Dense) (None, 10) 20010
dense_5 (Dense) (None, 2000) 22000
dense_6 (Dense) (None, 500) 1000500
dense_7 (Dense) (None, 17) 8517
=================================================================
Total params: 2,307,653
Trainable params: 2,307,653
Non-trainable params: 0
_________________________________________________________________
# Persist the trained weights to disk.
autoencoder.save_weights('autoencoder.h5')

# Run the data through the encoder half only; `pred` holds the output of
# the `encoded` layer for every row.
pred = encoder.predict(creditcard_df_scaled)

# Show the encoded features as a table.
encoded_table = pd.DataFrame(data=pred)
display(encoded_table)
280/280 [==============================] - 2s 8ms/step
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.000000 | 0.457111 | 0.890344 | 1.717090 | 0.967631 | 1.056988 | 0.0 | 0.0 | 0.000000 | 0.000000 |
| 1 | 1.246969 | 2.804320 | 2.092428 | 1.288674 | 1.347486 | 2.334675 | 0.0 | 0.0 | 0.057597 | 0.000000 |
| 2 | 0.740157 | 0.416786 | 0.131792 | 0.308030 | 1.390511 | 0.371790 | 0.0 | 0.0 | 1.168417 | 0.766312 |
| 3 | 0.174244 | 1.038864 | 1.484888 | 0.739221 | 2.466419 | 0.986579 | 0.0 | 0.0 | 0.394535 | 0.000000 |
| 4 | 0.000000 | 0.711937 | 0.688485 | 1.269932 | 0.978676 | 1.060225 | 0.0 | 0.0 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8945 | 0.064023 | 0.380342 | 0.521539 | 0.457458 | 0.077806 | 1.276789 | 0.0 | 0.0 | 0.094928 | 1.314105 |
| 8946 | 0.023236 | 0.158743 | 0.112626 | 0.606525 | 0.384708 | 1.096888 | 0.0 | 0.0 | 0.255362 | 1.441767 |
| 8947 | 0.000000 | 0.241459 | 0.576213 | 0.383459 | 0.225735 | 1.201463 | 0.0 | 0.0 | 0.139119 | 1.057619 |
| 8948 | 0.136915 | 0.013083 | 0.560873 | 0.189760 | 0.429158 | 1.643312 | 0.0 | 0.0 | 0.014472 | 0.058017 |
| 8949 | 0.087425 | 0.000000 | 0.407008 | 0.000000 | 0.697960 | 1.725392 | 0.0 | 0.0 | 1.392876 | 1.378196 |
8950 rows × 10 columns
# Shape of the encoder output: (number of rows, size of the encoded layer).
pred.shape
(8950, 10)
# Elbow method on the encoded features: fit K-Means for k = 1..19 and record
# the inertia (within-cluster sum of squared distances) for each k.
scores_2 = []
range_values = range(1, 20)
for i in range_values:
    kmeans = KMeans(n_clusters=i)
    kmeans.fit(pred)
    scores_2.append(kmeans.inertia_)

# Plot against the actual cluster counts. Plotting the list alone would use
# its positions 0..18 on the x-axis, putting every point one cluster to the
# left of the k it was computed for.
plt.plot(range_values, scores_2, 'bx-')
plt.title('Finding right number of clusters')
plt.xlabel('Clusters')
plt.ylabel('scores')
plt.show()
# Overlay the two elbow curves: `scores_1` in red, `scores_2` (encoded
# features) in green. Both are plotted against list position.
# The format string carries only marker and line style ('x-'); the original
# 'bx-' also set a colour that conflicted with the `color` keyword.
# NOTE(review): `scores_1` comes from an earlier cell -- presumably inertia
# on the original scaled features; confirm.
plt.plot(scores_1, 'x-', color='r')
plt.plot(scores_2, 'x-', color='g')
[<matplotlib.lines.Line2D at 0x201ae3dfc40>]
# Final clustering: 4 clusters on the encoded features.
kmeans = KMeans(n_clusters=4)
kmeans.fit(pred)
labels = kmeans.labels_

# Use a separate estimator for the clustering of the original scaled data.
# Calling kmeans.fit_predict(creditcard_df_scaled) here would refit `kmeans`
# on a different feature space, so its centres would no longer correspond
# to `labels`.
# NOTE(review): `y_kmeans` is not used in the visible part of this notebook;
# confirm whether it is still needed.
y_kmeans = KMeans(n_clusters=4).fit_predict(creditcard_df_scaled)

display(labels.shape) # Labels associated to each data point
display (labels.max())
display (labels.min())
(8950,)
3
0
# Put the cluster assignment from `labels` next to the customer data as a
# new 'cluster' column (rows are matched on the index).
assigned_cluster = pd.DataFrame(labels, columns=['cluster'])
df_cluster_dr = pd.concat(objs=[creditcard_df, assigned_cluster], axis='columns')
display(df_cluster_dr.head())
| BALANCE | BALANCE_FREQUENCY | PURCHASES | ONEOFF_PURCHASES | INSTALLMENTS_PURCHASES | CASH_ADVANCE | PURCHASES_FREQUENCY | ONEOFF_PURCHASES_FREQUENCY | PURCHASES_INSTALLMENTS_FREQUENCY | CASH_ADVANCE_FREQUENCY | CASH_ADVANCE_TRX | PURCHASES_TRX | CREDIT_LIMIT | PAYMENTS | MINIMUM_PAYMENTS | PRC_FULL_PAYMENT | TENURE | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 40.900749 | 0.818182 | 95.40 | 0.00 | 95.4 | 0.000000 | 0.166667 | 0.000000 | 0.083333 | 0.000000 | 0 | 2 | 1000.0 | 201.802084 | 139.509787 | 0.000000 | 12 | 0 |
| 1 | 3202.467416 | 0.909091 | 0.00 | 0.00 | 0.0 | 6442.945483 | 0.000000 | 0.000000 | 0.000000 | 0.250000 | 4 | 0 | 7000.0 | 4103.032597 | 1072.340217 | 0.222222 | 12 | 2 |
| 2 | 2495.148862 | 1.000000 | 773.17 | 773.17 | 0.0 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0 | 12 | 7500.0 | 622.066742 | 627.284787 | 0.000000 | 12 | 0 |
| 3 | 1666.670542 | 0.636364 | 1499.00 | 1499.00 | 0.0 | 205.788017 | 0.083333 | 0.083333 | 0.000000 | 0.083333 | 1 | 1 | 7500.0 | 0.000000 | 864.206542 | 0.000000 | 12 | 0 |
| 4 | 817.714335 | 1.000000 | 16.00 | 16.00 | 0.0 | 0.000000 | 0.083333 | 0.083333 | 0.000000 | 0.000000 | 0 | 1 | 1200.0 | 678.334763 | 244.791237 | 0.000000 | 12 | 0 |
# Project the encoded features onto their first two principal components
# so that the clusters can be drawn in a 2-D scatter plot.
pca = PCA(n_components=2)
prin_comp = pca.fit_transform(pred)

component_names = ['pca1', 'pca2']
pca_df = pd.DataFrame(prin_comp, columns=component_names)
display(pca_df.head())
| pca1 | pca2 | |
|---|---|---|
| 0 | -1.055746 | -0.363532 |
| 1 | 1.007257 | -1.402239 |
| 2 | -0.484735 | 0.673778 |
| 3 | -0.125583 | -0.421430 |
| 4 | -1.085654 | -0.393209 |
# Add the cluster assignment from `labels` as a 'cluster' column beside the
# two PCA coordinates, then preview the first rows.
label_frame = pd.DataFrame(labels, columns=['cluster'])
pca_df = pd.concat(objs=[pca_df, label_frame], axis='columns')
pca_df.head()
| pca1 | pca2 | cluster | |
|---|---|---|---|
| 0 | -1.055746 | -0.363532 | 0 |
| 1 | 1.007257 | -1.402239 | 2 |
| 2 | -0.484735 | 0.673778 | 0 |
| 3 | -0.125583 | -0.421430 | 0 |
| 4 | -1.085654 | -0.393209 | 0 |
# Number of rows assigned to each cluster, largest cluster first.
# `subset` is documented to take column label(s); passing the Series itself
# only worked because it is forwarded to groupby as a grouping key.
pca_df.value_counts(subset='cluster')
cluster 0 6310 2 1573 1 1000 3 67 dtype: int64
# Scatter the rows of `pca_df` in the pca1/pca2 plane, coloured by the
# 'cluster' column; the palette lists one colour per cluster (four here).
cluster_colours = ['red', 'green', 'blue', 'yellow']
plt.figure(figsize=(10, 10))
ax = sns.scatterplot(
    data=pca_df,
    x="pca1",
    y="pca2",
    hue="cluster",
    palette=cluster_colours,
)
plt.show()